### -------------------------------------------------- 
### ---- General Notes ---- 
### -------------------------------------------------- 

# 

### -------------------------------------------------- 
### ---- General Set Up ---- 
### -------------------------------------------------- 

### Clear global environment
rm(list=ls()) 


### Libraries:
library(pacman)
pacman::p_unload(all)
pacman::p_load(tidyverse, data.table, lazyeval, magrittr, lubridate, readstata13,
       textclean, here)


### File Location (R project only)
here::i_am("Survey Only Match/107_generate_state_flags.R")
file_path <- here()
file_path_data_matched <- here("Survey Only Match/Temp_Data/")


### Run file with functions for downloading VF files
source(paste0(file_path,"/Functions/functions_flag_states.R"))


### -------------------------------------------------- 
### ---- Download TFA Data Matched to State ----
### -------------------------------------------------- 

### Load tfa_dat_match from file #6
load(paste0(file_path_data_matched,"/tfa_to_state_ALL_v2.RData")) #changed name 


### -------------------------------------------------- 
### ---- Flag Obs Matched to State ----
### -------------------------------------------------- 

### -------------------------------------------------- 
### Create flags (dummy vars) for states
### -------------------------------------------------- 

### Create new copy of the tfa data
tfa_dat_flag <- tfa_dat_match

### Input
# List of state abbreviations 
data(state) #Download list of state abbreviations
state_abb_w_DC <- c("DC",state.abb) #Add DC to list of state abbreviations


# List variable names with state info (used in loop)
var_names <- c("state_ca", "state_uni", "state_st", 
               "state_mr", "state_cell")

### Run Loop 
for(s in 1:51){ #Run loop for all 50 states plus DC
  
  choose_ABB <- state_abb_w_DC[s] #Assign variable with one of 51 abbreviations
  
  for(v in 1:length(var_names)){ #Run for each of the 7 variables in var_names above
    # Function creates indicator variable for whether variable v equals a state s
    tfa_dat_flag <- fun_state_variable_flag(dat = tfa_dat_flag,
                                            ABB = choose_ABB,
                                            choose_var = var_names[v])
  }
  # Function creates (1) sum of total vars in var_names above that equal state s 
  # (2) indicator variable for whether any variable v equals a state s
  tfa_dat_flag <- fun_state_flag(dat = tfa_dat_flag,
                                 ABB = choose_ABB)
}

tfa_dat_flag %>% variable.names()


### -------------------------------------------------- 
### ---- Clean Vars to match VF ----
### -------------------------------------------------- 

temp <- tfa_dat_flag
#tfa_dat_flag <- temp

### -------------------------------------------------- 
### ***Names ----
### -------------------------------------------------- 

### Clean names
tfa_dat_flag %<>% 
  mutate(FirstName = toupper(firstname)) %>%
  mutate(LastName = toupper(lastname))


### -------------------------------------------------- 
### ***Gender ----
### -------------------------------------------------- 

### Check gender vars

# dem3 vs. Gender
tfa_dat %>% group_by(dem3) %>% summarize(total = round(n()/nrow(tfa_dat) *100,3))
tfa_dat %>% group_by(gender) %>% summarize(total = round(n()/nrow(tfa_dat) *100,3))
tfa_dat %>% group_by(dem3, gender) %>% summarize(total = round(n()/nrow(tfa_dat) *100,3))
# => Conclusion: gender from original app is better

### Recode Sex/gender to match vf
tfa_dat_flag %<>%
  mutate(Sex = NA) %>%
  mutate(Sex = ifelse(gender=="MALE","M",Sex) ) %>%
  mutate(Sex = ifelse(gender=="FEMALE","F",Sex) )


### -------------------------------------------------- 
### ***Dates (DoB and Graduation) ----
### -------------------------------------------------- 

### Use survey responses for birth year
tfa_dat_flag %<>%
  mutate(BirthYear=pre3_0)

### Clean graduation date
tfa_dat_flag %<>%
  # Reformat degree date (assume in 2000s if < 20)
    mutate(new_degreedate = as.Date(degreedate, "%d-%b-%y") ) %>%
    mutate(DegYear = strftime(new_degreedate, "19%y")) %>%
    mutate(DegYear = ifelse(as.numeric(DegYear)<1920,strftime(new_degreedate, "20%y"),DegYear)) %>%
    mutate(DegMonth = strftime(new_degreedate, "%m")) %>%
    mutate(DegDay = strftime(new_degreedate, "%d")) 


### -------------------------------------------------- 
### ***Phone Numbers ----
### -------------------------------------------------- 

### Clean phone numbers
tfa_dat_flag %<>%
  mutate(homephone_clean = gsub(" ", "", homephone)) %>%
  mutate(workphone_clean = gsub(" ", "", workphone)) %>%
  mutate(cellphone_clean = gsub(" ", "", cellphone)) %>%
  mutate(homephone_clean = textclean::strip(homephone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(workphone_clean = textclean::strip(workphone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(cellphone_clean = textclean::strip(cellphone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(homephone_area_code = str_sub(homephone_clean, start=1, end=3)) %>%
  mutate(workphone_area_code = str_sub(workphone_clean, start=1, end=3)) %>%
  mutate(cellphone_area_code = str_sub(cellphone_clean, start=1, end=3)) %>%
  mutate(homephone_number = str_sub(homephone_clean, start=4, end=10)) %>%
  mutate(workphone_number = str_sub(workphone_clean, start=4, end=10)) %>%
  mutate(cellphone_number = str_sub(cellphone_clean, start=4, end=10)) %>%
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)>10,NA,homephone_area_code)) %>%
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)==11 & str_sub(homephone_clean, start=1, end=1) == "1",
                                      str_sub(homephone_clean, start=2, end=4),homephone_area_code)) %>% 
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)==12 & str_sub(homephone_clean, start=1, end=2) == "01",
                                      str_sub(homephone_clean, start=3, end=5),homephone_area_code)) %>% 
  mutate(homephone_number = ifelse(nchar(homephone_clean)>10,NA,homephone_number)) %>%
  mutate(homephone_number = ifelse(nchar(homephone_clean)==11 & str_sub(homephone_clean, start=1, end=1) == "1",
                                   str_sub(homephone_clean, start=5, end=11),homephone_number)) %>% 
  mutate(homephone_number = ifelse(nchar(homephone_clean)==12 & str_sub(homephone_clean, start=1, end=2) == "01",
                                   str_sub(homephone_clean, start=6, end=12),homephone_number)) %>%
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)>10,NA,workphone_area_code)) %>%
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)==11 & str_sub(workphone_clean, start=1, end=1) == "1",
                                      str_sub(workphone_clean, start=2, end=4),workphone_area_code)) %>% 
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)==12 & str_sub(workphone_clean, start=1, end=2) == "01",
                                      str_sub(workphone_clean, start=3, end=5),workphone_area_code)) %>% 
  mutate(workphone_number = ifelse(nchar(workphone_clean)>10,NA,workphone_number)) %>%
  mutate(workphone_number = ifelse(nchar(workphone_clean)==11 & str_sub(workphone_clean, start=1, end=1) == "1",
                                   str_sub(workphone_clean, start=5, end=11),workphone_number)) %>% 
  mutate(workphone_number = ifelse(nchar(workphone_clean)==12 & str_sub(workphone_clean, start=1, end=2) == "01",
                                   str_sub(workphone_clean, start=6, end=12),workphone_number)) %>%
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)>10,NA,cellphone_area_code)) %>%
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)==11 & str_sub(cellphone_clean, start=1, end=1) == "1",
                                      str_sub(cellphone_clean, start=2, end=4),cellphone_area_code)) %>% 
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)==12 & str_sub(cellphone_clean, start=1, end=2) == "01",
                                      str_sub(cellphone_clean, start=3, end=5),cellphone_area_code)) %>% 
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)>10,NA,cellphone_number)) %>%
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)==11 & str_sub(cellphone_clean, start=1, end=1) == "1",
                                   str_sub(cellphone_clean, start=5, end=11),cellphone_number)) %>% 
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)==12 & str_sub(cellphone_clean, start=1, end=2) == "01",
                                   str_sub(cellphone_clean, start=6, end=12),cellphone_number))

### Standardize TFA Zipcodes
  
# Fix Unicode Errors
tfa_dat_flag[8903,"Current_Address.Zip"] <- "21224"
tfa_dat_flag[13899,"Current_Address.Zip"] <- "94115"
tfa_dat_flag[16007,"Current_Address.Zip"] <- "21224"
tfa_dat_flag[26012,"Current_Address.Zip"] <- "21209"
tfa_dat_flag[40815,"Current_Address.Zip"] <- NA

# Clean
tfa_dat_flag %<>%
  mutate(Current_Address.Zip = ifelse(trimws(Current_Address.Zip)=="-",NA,Current_Address.Zip)) %>% 
  mutate(Permanent_Address.Zip = ifelse(trimws(Permanent_Address.Zip)=="-",NA,Permanent_Address.Zip)) %>%
  mutate(Current_Address.Zip = ifelse(trimws(Current_Address.Zip)=="N/A",NA,Current_Address.Zip)) %>% 
  mutate(Permanent_Address.Zip = ifelse(trimws(Permanent_Address.Zip)=="N/A",NA,Permanent_Address.Zip)) %>%
  mutate(Current_Address.Zip_old = Current_Address.Zip,
         Permanent_Address.Zip_old = Permanent_Address.Zip) %>%
  mutate(Current_Address.Zip = ifelse(nchar(trimws(Current_Address.Zip))==10, 
                                      str_sub(Current_Address.Zip, start=1, end=5), 
                                              Current_Address.Zip)) %>%
  mutate(Permanent_Address.Zip = ifelse(nchar(trimws(Permanent_Address.Zip))==10, 
                                        str_sub(Permanent_Address.Zip, start=1, end=5), 
                                                Permanent_Address.Zip)) %>%
  mutate(Current_Address.Zip = ifelse(nchar(trimws(Current_Address.Zip))<5, 
                                      formatC(as.numeric(Current_Address.Zip), width=5, flag="0"), 
                                      Current_Address.Zip)) %>%
  mutate(Permanent_Address.Zip = ifelse(nchar(trimws(Permanent_Address.Zip))<5, 
                                        formatC(as.numeric(Permanent_Address.Zip), width=5, flag="0"),  
                                                Permanent_Address.Zip)) 


### Standardize TFA Dataset
tfa_dat_flag %<>%
  mutate(LandLine_AreaCode = ifelse(trimws(homephone_area_code)=="",NA,homephone_area_code)) %>% 
  mutate(LandLine_Number = ifelse(trimws(homephone_number)=="",NA,homephone_number)) %>%
  mutate(CellPhone_AreaCode = ifelse(trimws(cellphone_area_code)=="",NA,cellphone_area_code)) %>%
  mutate(CellPhone_Number = ifelse(trimws(cellphone_number)=="",NA,cellphone_number)) %>%
  mutate(Current_Address.Zip = ifelse(trimws(Current_Address.Zip)=="",NA,Current_Address.Zip)) %>% 
  mutate(Permanent_Address.Zip = ifelse(trimws(Permanent_Address.Zip)=="",NA,Permanent_Address.Zip)) 


### -------------------------------------------------- 
### ---- CHUNKING VARS ----
### -------------------------------------------------- 

tfa_dat_flag %<>%
  mutate(first_letter = str_to_upper(str_sub(LastName, start=1, end=1)))


### -------------------------------------------------- 
### ---- Save Dataset ----
### -------------------------------------------------- 

### Save tfa_dat_flag
save(tfa_dat_flag, 
     file=paste0(file_path_data_matched,"/tfa_to_state_ALL_flags_v2.RData")) #changed name 

### Save stata version for final analysis 
tfa_dat_flag_stata <- tfa_dat_flag %>%
  select(-starts_with('state_st_')) %>%
  select(-matches('^state_.*_total$')) %>%
  select(-matches('^state_.*_flag$')) 

tfa_dat_flag_stata$new_dateofbirth %>% is.na %>% sum

temp <- tfa_dat_flag
names(temp) <- gsub("\\.", "_", names(temp))
haven::write_dta(temp, paste0(file_path_data_matched,"/tfa_to_state_ALL_flags_v2.dta"))



